# -*- coding: utf-8 -*-
"""
Created on Fri Feb 20 10:43:40 2026

@author: Wieczorek_W_Station
"""

import requests
import pandas as pd
import time
import os
import pickle
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from tqdm import tqdm
import random as rd

# Base folder for all downloaded data; thread heads and comments are written
# to the "Molts" subfolder.
root = "C:\\Users\\Wieczorek_W_Station\\Dropbox\\Arbeit Kassel\\paperideen\\Moltbook_Science\\Data\\"
path = os.path.join(root, "Molts")

# Create the output folder if it does not exist yet. A failure is reported
# instead of being silently swallowed, so a wrong path is visible right away.
try:
    os.makedirs(path, exist_ok=True)
except OSError as err:
    print(f"Could not create output folder {path!r}: {err}")

# Search endpoint that is queried once per search term below.
BASE_URL = "https://www.moltbook.com/api/v1/search"
queries = [
    "science",
    "research",
    "scientific",
    "scientist",
    "academic",
    "paper",
    "study",
    "experiment",
    "theory",
]
#%%
# Download all search hits of type "post" for every query and store them as
# one CSV and one pickle per query inside `root`.
for query in queries:
    all_books = []
    offset = 0
    limit = 100

    # Page through the search results `limit` hits at a time.
    while offset < 250000:
        params = {
            "q": query,
            "type": "all",
            "limit": limit,
            "offset": offset,
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=60)
        except requests.RequestException as err:
            # Stop paging but keep (and save) what was collected so far.
            print(f"Request for {query} at offset {offset} failed: {err}")
            break

        if response.status_code == 429:
            # Rate limited: wait, then retry the same page.
            time.sleep(10)
            continue

        try:
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as err:
            print(f"Unusable response for {query} at offset {offset}: {err}")
            break

        results = data.get("results") if isinstance(data, dict) else None
        if not results:
            # An empty (or missing) result list marks the end of the paging.
            break

        for book in results:
            # Only hits of type "post" are stored. Everything else is skipped
            # so that no stale entry from a previous hit is appended again.
            if not isinstance(book, dict) or book.get("type") != "post":
                continue
            all_books.append({
                "id": book.get("id"),
                "display_name": book.get("display_name"),
                "name": book.get("name"),
                "title": book.get("title"),
                "submolt": book.get("submolt"),
                "post": book.get("post"),
                "post_id": book.get("post_id"),
                "content": book.get("content"),
                "slug": book.get("slug"),
                "author": book.get("author"),
                "relevance": book.get("relevance"),
                "upvotes": book.get("upvotes"),
                "downvotes": book.get("downvotes"),
                "created_at": book.get("created_at"),
                "query": query,
            })

        print(f"{len(all_books)} Threads loaded. Currently at {query}")

        offset += limit

        # Fixed delay between two pages.
        time.sleep(2.5)

    df = pd.DataFrame(all_books)
    # Without any hit the frame has no columns at all, so only drop rows with
    # missing content when the column exists.
    if "content" in df.columns:
        df = df.dropna(subset=["content"])

    # =============================================================================
    # Save the downloaded results
    # =============================================================================
    os.chdir(root)
    df.to_csv(f"Moltbook_Threads_{query}.csv", sep=";")

    with open(f"Moltbook_Threads_{query}.pickle", "wb") as fh:
        pickle.dump(df, fh)
#%%
# Reload the per-query result pickles from `root` and stack them into one
# DataFrame `df`.
# NOTE: pickle.load executes arbitrary code from the file; only load the
# pickle files this script has written itself.
os.chdir(root)
frames = []
for query in tqdm(queries):
    # The context manager closes every file handle right after loading.
    with open(f"Moltbook_Threads_{query}.pickle", "rb") as fh:
        frames.append(pickle.load(fh))
df = pd.concat(frames)
del frames
#%%
# For every remaining post: download the thread head from the API and scrape
# the comments from the rendered post page. Intermediate results are pickled
# every 50 posts, the complete results are written as CSV at the end.
os.chdir(path)
dfThreadsAll = []
dfCommentsAll = []

# Keep only hits with a non-empty post_id, a title and an author, and only
# one hit per title. Working on an explicit copy avoids chained-assignment
# warnings when the helper column is added.
dfReduced = df.dropna(subset=["post_id", "title", "author"]).copy()
dfReduced["lenPostId"] = [len(x) for x in dfReduced.post_id]
dfReduced = dfReduced[dfReduced.lenPostId > 0]
dfReduced = dfReduced.drop_duplicates(subset="title")

# `count` is the number of posts handled so far; the loop starts at this
# position of the post list.
count = 0
# Number of list entries that are already part of a checkpoint file. A failed
# download adds nothing to the lists, so slicing them by `count` would not be
# reliable.
threadsSaved = 0
commentsSaved = 0

for post_id in tqdm(dfReduced.post_id.iloc[count:].tolist()):
    # Checkpoint after every 50 posts: store whatever was collected since the
    # previous checkpoint (nothing is written when there is nothing new).
    if count > 0 and count % 50 == 0:
        if len(dfThreadsAll) > threadsSaved:
            with open(f"ThreadHead_{count}.pickle", "wb") as fh:
                pickle.dump(pd.concat(dfThreadsAll[threadsSaved:]), fh)
            threadsSaved = len(dfThreadsAll)
        if len(dfCommentsAll) > commentsSaved:
            with open(f"Comments_{count}.pickle", "wb") as fh:
                pickle.dump(pd.concat(dfCommentsAll[commentsSaved:]), fh)
            commentsSaved = len(dfCommentsAll)

    # Extra pause at count 1, 11, 21, ...
    if count % 10 == 1:
        print("pausing for 5 seconds")
        time.sleep(5)
    count += 1
    # =========================================================================
    # Fetch Threads
    # =========================================================================
    try:
        urlAPI = f"https://www.moltbook.com/api/v1/posts/{post_id}"
        response = requests.get(urlAPI, timeout=60)

        # Rate limited: wait and retry until the API answers again.
        while response.status_code == 429:
            time.sleep(10)
            response = requests.get(urlAPI, timeout=60)
        ThreadHead = response.json()

        # The JSON keys become the rows after transposing; only the "post"
        # row is kept.
        dfThreadHead = pd.DataFrame(ThreadHead).T
        dfThreadHead = dfThreadHead[dfThreadHead.index == "post"]
        dfThreadsAll.append(dfThreadHead)
    except Exception as err:
        # Best effort: report the failure and continue with the comments.
        print(f"Could not fetch thread head of {post_id}: {err}")

    # =========================================================================
    # Fetch Comments
    # =========================================================================
    url = f"https://www.moltbook.com/post/{post_id}"
    driver = None
    try:
        # The page is opened in a browser and read from the rendered source.
        driver = webdriver.Chrome()
        driver.get(url)
        # Wait for the page to load, with a small random jitter.
        time.sleep(4 + rd.random())

        soup = bs(driver.page_source, "html.parser")

        comments = soup.find("div", attrs={"class": "mt-6"})
        if comments is None:
            print(f"No comment section found for {post_id}")
        else:
            users = []
            posts = []

            for c in comments.find_all("div", attrs={"class": "py-2"}):
                # First link text is taken as the user name, all paragraphs
                # together as the comment text.
                users.append(c.find("a").text)
                posts.append("\n".join(p.text for p in c.find_all("p")))

            threadDf = pd.DataFrame({"name": users, "comment": posts})
            # NOTE: the column is called "post_id" but holds the page URL.
            threadDf.insert(2, "post_id", url)
            dfCommentsAll.append(threadDf)
    except Exception as err:
        # Best effort: report the failure and continue with the next post.
        print(f"Could not fetch comments of {post_id}: {err}")
    finally:
        # Always shut the browser down, but only if it was actually started.
        if driver is not None:
            driver.quit()

# Combine everything; an empty frame is written when nothing was collected.
dfThreadsAll = pd.concat(dfThreadsAll) if dfThreadsAll else pd.DataFrame()
dfCommentsAll = pd.concat(dfCommentsAll) if dfCommentsAll else pd.DataFrame()

dfThreadsAll.to_csv("ThreadsAll01.csv", sep=";")
dfCommentsAll.to_csv("CommentsAll01.csv", sep=";")
